
#######################
# Table 1 & Figure 1
#######################

library(readxl)
d <- as.data.frame(read_excel(paste0(path, "metadata/universe.xlsx"), sheet = 1))

dim(d)
head(d)
table(d$replicated)
table(d$replicated, d$reason)

d$all <- 1
d$targeted <- ifelse(d$replicated == 1 | d$reason %in% c("no code", "no data", "nonreplicable"), 1, 0)
d$hasdata <- ifelse(d$replicated == 1 | d$reason %in% c("no code", "nonrepliable"), 1, 0)
d$nodata <- ifelse(d$reason %in% c("no data"), 1, 0)
d$nocode <- ifelse(d$reason %in% c("no code"), 1, 0)
d$error <- ifelse(d$reason %in% c("nonreplicable"), 1, 0)
table(d$targeted)
table(d$hasdata, d$targeted)
table(d$nodata, d$targeted)
table(d$nocode, d$targeted)
table(d$error, d$targeted)
table(d$replicated, d$targeted)

# Table 1
a <- d[which(d$targeted == 1),]  # meeting our criteria

# table structure
tb1 <- matrix(NA, 3, 6)
colnames(tb1) <- c("all", "nodata", "nocode", "error", "success",  "pct")
rownames(tb1) <- c("ajps", "apsr", "jop")

tb1[, 1] <- table(a$journal) # first column
tb1[, 2] <- table(a$journal, a$nodata)[, 2] # second column
tb1[, 3] <- table(a$journal, a$nocode)[, 2] # third column
tb1[, 4] <- table(a$journal, a$error)[, 2] # fourth column
tb1[, 5] <- table(a$journal, a$replicated)[, 2] # fifth column
tb1 <- rbind(tb1, apply(tb1, 2, sum))
tb1[, 6] <- round(tb1[, 5]/tb1[,1]*100)
tb1 <- tb1[c(2,1,3,4), ]  
write.csv(tb1, file = "tables/table1.csv")

# Figure 1
year <- 2010:2022
all <- aggregate(d$all, list(d$year), FUN = sum)[, 2]
targeted <- aggregate(d$targeted, list(d$year), FUN = sum)[, 2]
hasdata <- aggregate(d$hasdata, list(d$year), FUN = sum)[, 2]
replicated <- aggregate(d$replicated, list(d$year), FUN = sum)[, 2]

s <- rbind(all, targeted, replicated)
s

sum(targeted) # universe
sum(hasdata)
sum(replicated)

pdf("graphs/Figure1.pdf", width = 12, height = 5)
par(lend = 1, mar = c(3, 4, 1, 1))
mycol <- c("gray30", "gray70", "gray90")
barplot(s,
  main = "", beside = TRUE, ylim = c(0, 25), col = mycol,
  names.arg = 2010:2022, ylab = "# Papers"
); box()
legend("top", c("All IV Papers", "Meeting Criteria", "Replicable"),
  cex = 1.2, col = mycol, lty = 1, lwd = 15, seg.len = 1, bty = "n",
  border = NA, ncol = 3
)
graphics.off()



#######################
# Table 2
#######################

load("metadata/iv_replicate.rds")
d$iv_subtype[which(d$iv_subtype %in% c("weather", "climate", "geography"))] <- "weather/climate/geography"
n <- nrow(d)

# table structure
tb2 <- matrix(NA, 14, 2)
colnames(tb2) <- c("number", "pct")
rownames(tb2) <- c("experiment", "rules", "fuzzy rd", "exposure", "theory", "weather/climate/geography",
                   "diffusion", "history", "others", "metrics", "inter", "lag", "test", "total")

# large category
a1 <- c(table(d$iv_type))
tb2[c(10, 1, 2, 5), 1] <- a1
tb2[14, 1] <- sum(a)

a2 <- table(d$iv_subtype[which(d$iv_type == "rules & policy changes")])
tb2[c(4, 3), 1] <- a2

a3 <- table(d$iv_subtype[which(d$iv_type == "theory")])
tb2[c(7, 8, 9, 6), 1] <- a3

a4 <- table(d$iv_subtype[which(d$iv_type == "econometrics")])
tb2[11:13, 1] <- a4

tb2[, 2] <- round(tb2[, 1]/70*100, 1)
write.csv(tb2, file = "tables/table2.csv")

#################
### Table 3
#################


load("metadata/iv_replicate.rds")


# replicated F
d$f_replicated <- d$f_standard
d$f_replicated[which(d$original_estimation_f_type == "robust")] <- d$f_robust[which(d$original_estimation_f_type == "robust")]
d$f_replicated

getStat <- function(var, method = "mean") {
  n <- nrow(d)
  out <- rep(NA, 3)
  if (method == "mean") {
    out[1] <- round(mean(var[which(d$iv_type == "experiment")], na.rm = TRUE), 3)
    out[2] <- round(mean(var[which(d$iv_type != "experiment")], na.rm = TRUE), 3)
    out[3] <- round(mean(var, na.rm = TRUE), 3)
  } else if (method == "median") {
    out[1] <- round(median(var[which(d$iv_type == "experiment")], na.rm = TRUE), 2)
    out[2] <- round(median(var[which(d$iv_type != "experiment")], na.rm = TRUE), 2)
    out[3] <- round(median(var, na.rm = TRUE), 2)
  }
  names(out) <- c("Experimental", "Obervational", "All")
  return(out)
}

# table structure
tb3 <- matrix(NA, 14, 3)
colnames(tb3) <- c("exp", "obs", "all")
rownames(tb3) <- c("unreported", "effF < 10", "bootF < 10", "med effF",
                   "p_orig insig", "p_AR insig", "p_bootc insig", "p_boott insig", 
                   "p_tF insig", "same_sign", "ratio > 1", "ratio > 5", "ratio > 10", 
                   "med ratio")

# Panel A: F stats
f.missing <- is.na(d$f_report)
tb3[1,] <- getStat(f.missing)

f.below10 <- (d$f_effective < 10)
tb3[2,] <- getStat(f.below10)

f.below10 <- (d$f_boot < 10)
tb3[3,] <- getStat(f.below10)

tb3[4,] <- getStat(d$f_effective, method = "median")


# Panel B: p values
d$iv_report_z <- d$iv_report_coef / d$iv_report_se
d$iv_report_p <- 2 * (1 - pnorm(abs(d$iv_report_z)))
p.report.insig <- d$iv_report_p > 0.05
tb3[5,] <- getStat(p.report.insig)

p.AR.insig <- (d$AR_p > 0.05 | d$AR_bounded == FALSE)
tb3[6,] <- getStat(p.AR.insig)

p.bootc.insig <- d$iv_bootc_p > 0.05
tb3[7,] <- getStat(p.bootc.insig)

p.boott.insig <- d$iv_boott_p > 0.05
tb3[8,] <- getStat(p.boott.insig)

p.tF.insig <- (d$tF_p > 0.05)
tb3[9,] <- getStat(p.tF.insig)


# Panel C. IV vs. OLS
samesign <- sign(d$iv_coef) == sign(d$ols_coef)
tb3[10,] <- getStat(samesign)

ratio.1 <- abs(d$iv_coef) >= 1 * abs(d$ols_coef)
tb3[11,] <- getStat(ratio.1)

ratio.5 <- abs(d$iv_coef) >= 5 * abs(d$ols_coef)
tb3[12,] <- getStat(ratio.5)

ratio.10 <- abs(d$iv_coef) >= 10 * abs(d$ols_coef)
tb3[13,] <- getStat(ratio.10)

ratio <- abs(d$iv_coef / d$ols_coef)
tb3[14,] <- getStat(ratio, method = "median")

write.csv(tb3, file = "tables/table3.csv")


#######################
# Figure 6
#######################

library(ivDiag)

## Rueda2017
df <- readRDS("./rawdata/ajps_Rueda_2017.rds")
D <- "lm_pob_mesa"
Y <- "e_vote_buying"
Z <- "lz_pob_mesa_f"
controls <- c("lpopulation", "lpotencial")
cl <- "muni_code"
FE <- NULL
weights <- NULL
(g <- ivDiag(data = df, Y = Y, D = D, Z = Z, controls = controls, FE = FE, cl = cl, weights = weights))

pdf("graphs/Figure6.pdf", width = 8, height = 4.5)
par(mar = c(4, 4, 1, 2))
plot_coef(g, ylim = c(-2, 0.5), main = "", ylab = "Treatment Effect Estimate")
graphics.off()
